generated from amazon-archives/__template_MIT-0
-
Notifications
You must be signed in to change notification settings - Fork 173
254 lines (214 loc) · 9.18 KB
/
fsdp-regression-test-venv.yml
File metadata and controls
254 lines (214 loc) · 9.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
name: FSDP Regression Test (venv)
# TODO: Additional test cases to matrix. Change max-parallel.
on:
  push:
    branches: [ "main" ]
    paths:
      - '3.test_cases/pytorch/FSDP/**'
  pull_request:
    paths:
      - '3.test_cases/pytorch/FSDP/**'
  workflow_dispatch:
env:
  # Region and IAM role assumed via GitHub OIDC (see permissions below).
  AWS_REGION: us-east-1
  AWS_ROLE_ARN: arn:aws:iam::159553542841:role/awslabs-AOSH-GitHubActionsRole
  # Slurm login node reached over SSH for job submission and monitoring.
  SLURM_HOST: p5en.smml.aiml.aws.dev
  SLURM_USER: ghactions
  # Shared-filesystem roots used to build per-run scratch directories.
  BASE_PATH: /fsx/agents/pr-reviews
  HOME_PATH: /home/ghactions
permissions:
  id-token: write  # required by aws-actions/configure-aws-credentials (OIDC)
  contents: read
jobs:
  regression:
    strategy:
      # Abort the remaining matrix entries on the first failure to free
      # cluster capacity quickly.
      fail-fast: true
      max-parallel: 3
      matrix:
        cluster: [p5, p5-smhp]
        model_config: [llama2_7b, llama2_13b, llama2_70b, llama3_1_8b, llama3_1_70b]
    runs-on: ubuntu-latest
    concurrency:
      # At most one in-flight run per (cluster, model) pair; later runs queue
      # behind it rather than cancelling it.
      group: ${{ github.workflow }}-${{ matrix.cluster }}-${{ matrix.model_config }}
      cancel-in-progress: false
    timeout-minutes: 375
    steps:
- name: Checkout code
uses: actions/checkout@v4
with:
path: source-code
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: ${{ env.AWS_ROLE_ARN }}
aws-region: ${{ env.AWS_REGION }}
- name: Setup SSH Key
run: |
mkdir -p ~/.ssh
echo "${{ secrets.SLURM_SSH_KEY }}" > ~/.ssh/slurm_key
chmod 600 ~/.ssh/slurm_key
# Add host to known hosts with retry
for i in {1..5}; do
if ssh-keyscan -H ${{ env.SLURM_HOST }} >> ~/.ssh/known_hosts 2>/dev/null; then
echo "SSH keyscan successful"
break
fi
echo "SSH keyscan attempt $i failed, retrying..."
sleep 5
done
- name: Setup Environment Variables
id: setup
run: |
BUILD_ID="${{ github.run_id }}"
REMOTE_TEST_PATH="${{ env.BASE_PATH }}/venv-tests/${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}"
LOG_DIR="${{ env.HOME_PATH }}/regression-logs-${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}"
CHECKPOINT_DIR="${{ env.BASE_PATH }}/checkpoints-${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}"
echo "remote_test_path=$REMOTE_TEST_PATH" >> $GITHUB_OUTPUT
echo "log_dir=$LOG_DIR" >> $GITHUB_OUTPUT
echo "checkpoint_dir=$CHECKPOINT_DIR" >> $GITHUB_OUTPUT
echo "REMOTE_TEST_PATH=$REMOTE_TEST_PATH" >> $GITHUB_ENV
echo "LOG_DIR=$LOG_DIR" >> $GITHUB_ENV
echo "CHECKPOINT_DIR=$CHECKPOINT_DIR" >> $GITHUB_ENV
- name: Create Remote Directories
run: |
ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 \
${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} << EOF
mkdir -p ${{ env.REMOTE_TEST_PATH }}
mkdir -p ${{ env.LOG_DIR }}
mkdir -p ${{ env.CHECKPOINT_DIR }}
chmod 755 ${{ env.LOG_DIR }} ${{ env.CHECKPOINT_DIR }}
EOF
- name: Transfer Code to Cluster
run: |
# Transfer code with retry
for i in {1..3}; do
if scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 -r \
source-code/* ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.REMOTE_TEST_PATH }}/; then
echo "Code transfer successful"
break
fi
echo "Transfer attempt $i failed, retrying..."
sleep 10
done
- name: Create Virtual Environment on Cluster
run: |
FSDP_SLURM_DIR="${{ env.REMOTE_TEST_PATH }}/3.test_cases/pytorch/FSDP/slurm"
echo "Creating virtual environment on cluster..."
ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 \
${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} << EOF
set -e
cd $FSDP_SLURM_DIR
bash ./create_venv.sh
echo "Virtual environment created successfully!"
EOF
- name: Prepare and Submit Slurm Job
id: submit_job
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
FSDP_SLURM_DIR="${{ env.REMOTE_TEST_PATH }}/3.test_cases/pytorch/FSDP/slurm"
SBATCH_FILE="${{ matrix.model_config }}-training.sbatch"
TMP_SBATCH="regression_test_${{ matrix.model_config }}.sbatch"
# Prepare and submit job on cluster
ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 \
${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} << EOF
set -e
cd $FSDP_SLURM_DIR
if [ ! -f "$SBATCH_FILE" ]; then
echo "Error: sbatch file ${SBATCH_FILE} does not exist!"
exit 1
fi
cp "$SBATCH_FILE" "$TMP_SBATCH"
# Modify sbatch script
sed -i "s|#SBATCH --output=.*|#SBATCH --output=${{ env.LOG_DIR }}/regression_test_%j.out|" "$TMP_SBATCH"
sed -i "s|#SBATCH --error=.*|#SBATCH --error=${{ env.LOG_DIR }}/regression_test_%j.err|" "$TMP_SBATCH"
sed -i "s|--checkpoint_dir=./checkpoints|--checkpoint_dir=${{ env.CHECKPOINT_DIR }}|g" "$TMP_SBATCH"
# Activate venv in the sbatch script
sed -i '1a source env/bin/activate' "$TMP_SBATCH"
# Submit job
echo "Submitting Slurm job..."
JOB_ID=\$(sbatch --parsable \$TMP_SBATCH)
echo "JOB_ID=\$JOB_ID" >> ${{ env.REMOTE_TEST_PATH }}/job_info.txt
echo "Submitted job: \$JOB_ID"
EOF
# Get job ID
sleep 2
JOB_ID=$(ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \
"cat ${{ env.REMOTE_TEST_PATH }}/job_info.txt | grep JOB_ID | cut -d= -f2")
echo "job_id=$JOB_ID" >> $GITHUB_OUTPUT
echo "JOB_ID=$JOB_ID" >> $GITHUB_ENV
echo "Submitted Slurm job: $JOB_ID"
- name: Monitor Job with Real-time Logs
id: monitor_job
run: |
echo "Monitoring job ${{ env.JOB_ID }}..."
START_TIME=$(date +%s)
TIMEOUT=21600 # 6 hours
LOG_FILE="${{ env.LOG_DIR }}/regression_test_${{ env.JOB_ID }}.out"
while true; do
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - START_TIME))
if [ $ELAPSED -gt $TIMEOUT ]; then
echo "Timeout reached after 6 hours"
exit 1
fi
# Check job status
JOB_STATUS=$(ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \
"squeue -j ${{ env.JOB_ID }} -h -o %T 2>/dev/null || echo 'COMPLETED'")
if [ -z "$JOB_STATUS" ] || [ "$JOB_STATUS" == "COMPLETED" ]; then
echo "Job completed successfully"
break
elif [ "$JOB_STATUS" == "FAILED" ] || [ "$JOB_STATUS" == "CANCELLED" ] || [ "$JOB_STATUS" == "TIMEOUT" ]; then
echo "Job failed with status: $JOB_STATUS"
exit 1
fi
# Stream logs in real-time
ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \
"tail -n 50 $LOG_FILE 2>/dev/null || echo 'Waiting for log file...'"
echo "--- Job status: $JOB_STATUS (elapsed: $((ELAPSED / 60)) min) ---"
sleep 30
done
- name: Retrieve Logs
if: always()
run: |
echo "Retrieving logs from cluster..."
mkdir -p ./logs
# Copy logs with retry
for i in {1..3}; do
if scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 -r \
${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.LOG_DIR }}/* ./logs/ 2>/dev/null; then
echo "Logs retrieved successfully"
break
fi
echo "Log retrieval attempt $i failed, retrying..."
sleep 10
done
- name: Upload logs as artifacts
if: always()
uses: actions/upload-artifact@v4
with:
name: regression-logs-${{ github.run_id }}-${{ matrix.model_config }}-${{ matrix.cluster }}
path: ./logs
retention-days: 60
- name: Cleanup
if: always()
run: |
echo "Cleaning up remote resources..."
# Cancel job if still running
if [ -n "${{ env.JOB_ID }}" ]; then
ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \
"scancel ${{ env.JOB_ID }} 2>/dev/null || true"
fi
# Clean up directories
ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} << EOF
rm -rf ${{ env.REMOTE_TEST_PATH }}
rm -rf ${{ env.LOG_DIR }}
rm -rf ${{ env.CHECKPOINT_DIR }}
EOF
rm -rf ./logs
echo "Cleanup completed!"