1313
1414 workflow_dispatch :
1515
# Shared configuration for all jobs: the Slurm cluster endpoint, the IAM role
# assumed via OIDC, and the remote base paths used for per-run artifacts.
env:
  AWS_REGION: us-east-1
  SLURM_HOST: p5en.smml.aiml.aws.dev
  SLURM_USER: ghactions
  AWS_ROLE_ARN: arn:aws:iam::159553542841:role/awslabs-AOSH-GitHubActionsRole
  BASE_PATH: /fsx/agents/pr-reviews
  HOME_PATH: /home/ghactions

permissions:
  # id-token: write is required for OIDC federation with
  # aws-actions/configure-aws-credentials.
  id-token: write
  contents: read
jobs:
  regression:
    strategy:
      # Fan out over every cluster/model combination.
      matrix:
        cluster: [p5, p5-smhp]
        model_config: [llama2_7b, llama2_13b, llama2_70b, llama3_1_8b, llama3_1_70b]
    # Hosted runner only orchestrates over ssh; the heavy lifting runs on the
    # Slurm cluster.
    runs-on: ubuntu-latest
    concurrency:
      # One in-flight run per (cluster, model) cell; queued runs wait rather
      # than cancel the active one.
      group: ${{ github.workflow }}-${{ matrix.cluster }}-${{ matrix.model_config }}
      cancel-in-progress: false
    # 6 h Slurm job budget plus overhead for transfer/monitoring/cleanup.
    timeout-minutes: 375
    steps:
3042 - name : Checkout code
3143 uses : actions/checkout@v4
3244 with :
33- path : ${{ github.run_id }}
45+ path : source-code
3446
35- - name : Set env vars
47+ - name : Configure AWS Credentials
48+ uses : aws-actions/configure-aws-credentials@v4
49+ with :
50+ role-to-assume : ${{ env.AWS_ROLE_ARN }}
51+ aws-region : ${{ env.AWS_REGION }}
52+
53+ - name : Setup SSH Key
3654 run : |
37- HOME_DIR="/home/github"
38- BUILD_ID="${{ github.run_id }}"
39- FSDP_DIR="$(pwd)/${BUILD_ID}/3.test_cases/pytorch/FSDP"
40- LOG_DIR="${HOME_DIR}/regression-logs-${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}"
41- CHECKPOINT_DIR="${HOME_DIR}/regression-checkpoints-${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}"
55+ mkdir -p ~/.ssh
56+ echo "${{ secrets.SLURM_SSH_KEY }}" > ~/.ssh/slurm_key
57+ chmod 600 ~/.ssh/slurm_key
58+
59+ # Add host to known hosts with retry
60+ for i in {1..5}; do
61+ if ssh-keyscan -H ${{ env.SLURM_HOST }} >> ~/.ssh/known_hosts 2>/dev/null; then
62+ echo "SSH keyscan successful"
63+ break
64+ fi
65+ echo "SSH keyscan attempt $i failed, retrying..."
66+ sleep 5
67+ done
4268
43- echo "HOME_DIR=$HOME_DIR" >> $GITHUB_ENV
44- echo "BUILD_ID=$BUILD_ID" >> $GITHUB_ENV
45- echo "FSDP_DIR=$FSDP_DIR" >> $GITHUB_ENV
69+ - name : Setup Environment Variables
70+ id : setup
71+ run : |
72+ BUILD_ID="${{ github.run_id }}"
73+ REMOTE_TEST_PATH="${{ env.BASE_PATH }}/venv-tests/${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}"
74+ LOG_DIR="${{ env.HOME_PATH }}/regression-logs-${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}"
75+ CHECKPOINT_DIR="${{ env.BASE_PATH }}/checkpoints-${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}"
76+
77+ echo "remote_test_path=$REMOTE_TEST_PATH" >> $GITHUB_OUTPUT
78+ echo "log_dir=$LOG_DIR" >> $GITHUB_OUTPUT
79+ echo "checkpoint_dir=$CHECKPOINT_DIR" >> $GITHUB_OUTPUT
80+
81+ echo "REMOTE_TEST_PATH=$REMOTE_TEST_PATH" >> $GITHUB_ENV
4682 echo "LOG_DIR=$LOG_DIR" >> $GITHUB_ENV
4783 echo "CHECKPOINT_DIR=$CHECKPOINT_DIR" >> $GITHUB_ENV
48- echo "Env vars set successfully!"
4984
50- - name : Create directories
85+ - name : Create Remote Directories
5186 run : |
52- mkdir -p ${{ env.LOG_DIR }} ${{ env.CHECKPOINT_DIR }}
53- chmod 755 ${{ env.LOG_DIR }} ${{ env.CHECKPOINT_DIR }}
54-
55- - name : Create virtual environment
56- working-directory : ${{ env.FSDP_DIR }}/slurm
87+ ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 \
88+ ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} << EOF
89+ mkdir -p ${{ env.REMOTE_TEST_PATH }}
90+ mkdir -p ${{ env.LOG_DIR }}
91+ mkdir -p ${{ env.CHECKPOINT_DIR }}
92+ chmod 755 ${{ env.LOG_DIR }} ${{ env.CHECKPOINT_DIR }}
93+ EOF
94+
95+ - name : Transfer Code to Cluster
5796 run : |
58- bash ./create_venv.sh
59- echo "Virtual environment created successfully!"
97+ # Transfer code with retry
98+ for i in {1..3}; do
99+ if scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 -r \
100+ source-code/* ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.REMOTE_TEST_PATH }}/; then
101+ echo "Code transfer successful"
102+ break
103+ fi
104+ echo "Transfer attempt $i failed, retrying..."
105+ sleep 10
106+ done
60107
61- - name : Run regression test
62- id : run_test
63- working-directory : ${{ env.FSDP_DIR }}/slurm
108+ - name : Create Virtual Environment on Cluster
109+ run : |
110+ FSDP_SLURM_DIR="${{ env.REMOTE_TEST_PATH }}/3.test_cases/pytorch/FSDP/slurm"
111+
112+ echo "Creating virtual environment on cluster..."
113+ ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 \
114+ ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} << EOF
115+ set -e
116+ cd $FSDP_SLURM_DIR
117+ bash ./create_venv.sh
118+ echo "Virtual environment created successfully!"
119+ EOF
120+
121+ - name : Prepare and Submit Slurm Job
122+ id : submit_job
64123 env :
65124 HF_TOKEN : ${{ secrets.HF_TOKEN }}
66125 run : |
67- source env/bin/activate
126+ FSDP_SLURM_DIR="${{ env.REMOTE_TEST_PATH }}/3.test_cases/pytorch/FSDP/slurm"
68127 SBATCH_FILE="${{ matrix.model_config }}-training.sbatch"
69- TMP_SBATCH="regression_test_${{ matrix.model_config }}.sbatch"
70-
71- if [ ! -f "$SBATCH_FILE" ]; then
72- echo "Error: sbatch file ${SBATCH_FILE} does not exist!"
73- exit 1
74- fi
128+ TMP_SBATCH="regression_test_${{ matrix.model_config }}.sbatch"
129+
130+ # Prepare and submit job on cluster
131+ ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 \
132+ ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} << EOF
133+ set -e
134+ cd $FSDP_SLURM_DIR
135+
136+ if [ ! -f "$SBATCH_FILE" ]; then
137+ echo "Error: sbatch file ${SBATCH_FILE} does not exist!"
138+ exit 1
139+ fi
140+
141+ cp "$SBATCH_FILE" "$TMP_SBATCH"
142+
143+ # Modify sbatch script
144+ sed -i "s|#SBATCH --output=.*|#SBATCH --output=${{ env.LOG_DIR }}/regression_test_%j.out|" "$TMP_SBATCH"
145+ sed -i "s|#SBATCH --error=.*|#SBATCH --error=${{ env.LOG_DIR }}/regression_test_%j.err|" "$TMP_SBATCH"
146+ sed -i "s|--checkpoint_dir=./checkpoints|--checkpoint_dir=${{ env.CHECKPOINT_DIR }}|g" "$TMP_SBATCH"
147+
148+ # Activate venv in the sbatch script
149+ sed -i '1a source env/bin/activate' "$TMP_SBATCH"
150+
151+ # Submit job
152+ echo "Submitting Slurm job..."
153+ JOB_ID=\$(sbatch --parsable \$TMP_SBATCH)
154+ echo "JOB_ID=\$JOB_ID" >> ${{ env.REMOTE_TEST_PATH }}/job_info.txt
155+ echo "Submitted job: \$JOB_ID"
156+ EOF
157+
158+ # Get job ID
159+ sleep 2
160+ JOB_ID=$(ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
161+ ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \
162+ " cat ${{ env.REMOTE_TEST_PATH }}/job_info.txt | grep JOB_ID | cut -d= -f2" )
163+
164+ echo "job_id=$JOB_ID" >> $GITHUB_OUTPUT
165+ echo "JOB_ID=$JOB_ID" >> $GITHUB_ENV
166+ echo "Submitted Slurm job : $JOB_ID"
75167
76- cp "$SBATCH_FILE" "$TMP_SBATCH"
168+ - name : Monitor Job with Real-time Logs
169+ id : monitor_job
170+ run : |
171+ echo "Monitoring job ${{ env.JOB_ID }}..."
172+ START_TIME=$(date +%s)
173+ TIMEOUT=21600 # 6 hours
174+
175+ LOG_FILE="${{ env.LOG_DIR }}/regression_test_${{ env.JOB_ID }}.out"
176+
177+ while true; do
178+ CURRENT_TIME=$(date +%s)
179+ ELAPSED=$((CURRENT_TIME - START_TIME))
180+
181+ if [ $ELAPSED -gt $TIMEOUT ]; then
182+ echo "Timeout reached after 6 hours"
183+ exit 1
184+ fi
185+
186+ # Check job status
187+ JOB_STATUS=$(ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
188+ ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \
189+ "squeue -j ${{ env.JOB_ID }} -h -o %T 2>/dev/null || echo 'COMPLETED'")
190+
191+ if [ -z "$JOB_STATUS" ] || [ "$JOB_STATUS" == "COMPLETED" ]; then
192+ echo "Job completed successfully"
193+ break
194+ elif [ "$JOB_STATUS" == "FAILED" ] || [ "$JOB_STATUS" == "CANCELLED" ] || [ "$JOB_STATUS" == "TIMEOUT" ]; then
195+ echo "Job failed with status: $JOB_STATUS"
196+ exit 1
197+ fi
198+
199+ # Stream logs in real-time
200+ ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
201+ ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \
202+ "tail -n 50 $LOG_FILE 2>/dev/null || echo 'Waiting for log file...'"
203+
204+ echo "--- Job status: $JOB_STATUS (elapsed: $((ELAPSED / 60)) min) ---"
205+ sleep 30
206+ done
77207
78- sed -i "s|#SBATCH --output=.*|#SBATCH --output=${{ env.LOG_DIR }}/regression_test_%j.out|" "$TMP_SBATCH"
79- sed -i "s|#SBATCH --error=.*|#SBATCH --error=${{ env.LOG_DIR }}/regression_test_%j.err|" "$TMP_SBATCH"
80- sed -i "s|--checkpoint_dir=./checkpoints|--checkpoint_dir=${{ env.CHECKPOINT_DIR }}|g" "$TMP_SBATCH"
208+ - name : Retrieve Logs
209+ if : always()
210+ run : |
211+ echo "Retrieving logs from cluster..."
212+ mkdir -p ./logs
213+
214+ # Copy logs with retry
215+ for i in {1..3}; do
216+ if scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 -r \
217+ ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.LOG_DIR }}/* ./logs/ 2>/dev/null; then
218+ echo "Logs retrieved successfully"
219+ break
220+ fi
221+ echo "Log retrieval attempt $i failed, retrying..."
222+ sleep 10
223+ done
81224
82- echo "Submitting Slurm job..."
83- sbatch --wait ${TMP_SBATCH}
84- exit_code=$?
85- echo "exit_code=$exit_code" >> $GITHUB_OUTPUT
86- echo "Slurm job completed with exit code: $exit_code"
87- if [ $exit_code -ne 0 ]; then
88- echo "Slurm job failed with exit code: $exit_code"
89- exit $exit_code
90- fi
91-
92225 - name : Upload logs as artifacts
93226 if : always()
94227 uses : actions/upload-artifact@v4
95228 with :
96229 name : regression-logs-${{ github.run_id }}-${{ matrix.model_config }}-${{ matrix.cluster }}
97- path : ${{ env.LOG_DIR }}
230+ path : ./logs
98231 retention-days : 60
99232
100233 - name : Cleanup
101234 if : always()
102235 run : |
103- echo "Cleaning up..."
104- rm -rf ${{ env.LOG_DIR }} ${{ env.CHECKPOINT_DIR }}
105- echo "Logs and checkpoints cleaned up successfully!"
106-
236+ echo "Cleaning up remote resources..."
237+
238+ # Cancel job if still running
239+ if [ -n "${{ env.JOB_ID }}" ]; then
240+ ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
241+ ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \
242+ "scancel ${{ env.JOB_ID }} 2>/dev/null || true"
243+ fi
244+
245+ # Clean up directories
246+ ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
247+ ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} << EOF
248+ rm -rf ${{ env.REMOTE_TEST_PATH }}
249+ rm -rf ${{ env.LOG_DIR }}
250+ rm -rf ${{ env.CHECKPOINT_DIR }}
251+ EOF
252+
253+ rm -rf ./logs
254+ echo "Cleanup completed!"
0 commit comments