
Commit 98ffa86: Merge branch 'main' into patch-1
2 parents: 1664922 + 7d81153

149 files changed: +9848, -1476 lines


.github/CODEOWNERS

Lines changed: 7 additions & 6 deletions
@@ -1,7 +1,8 @@
-# CODEOWNERS for aws-samples/awsome-distributed-training
-# Enforces that a member of the @aws-samples/sagemaker-hyperpod-dev team
-# must approve any PRs that modify files under either base-config directory,
-# including all nested subdirectories and files.
+# See https://help.github.com/articles/about-codeowners/
+
 
-1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/** @aws-samples/hyperpod-customer-onbehalf-devs
-1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/** @aws-samples/hyperpod-customer-onbehalf-devs
+# Enforces that a member of the @aws-samples/sagemaker-hyperpod-dev team for HyperPod lifecycle scripts
+# They must approve any PRs that modify files under either base-config directory,
+# including all nested subdirectories and files.
+/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config @aws-samples/hyperpod-lcs-dev
+/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config @aws-samples/hyperpod-lcs-dev
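
Note on the ownership rules above: per the file's own comment, a directory pattern such as /1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config already covers all nested subdirectories and files, so dropping the trailing /** does not narrow the match; the leading / anchors the pattern to the repository root. If the @aws-samples/hyperpod-lcs-dev handle is ever renamed, one way to confirm the file still resolves cleanly is GitHub's CODEOWNERS errors endpoint (a sketch, assuming an authenticated gh CLI):

    # List any syntax or unknown-owner errors GitHub reports for this repository's CODEOWNERS
    gh api repos/aws-samples/awsome-distributed-training/codeowners/errors
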

.github/workflows/fsdp-regression-test-container.yml

Lines changed: 266 additions & 77 deletions
Large diffs are not rendered by default.

.github/workflows/fsdp-regression-test-venv.yml

Lines changed: 198 additions & 50 deletions
@@ -13,6 +13,18 @@ on:
 
   workflow_dispatch:
 
+env:
+  AWS_REGION: us-east-1
+  SLURM_HOST: p5en.smml.aiml.aws.dev
+  SLURM_USER: ghactions
+  AWS_ROLE_ARN: arn:aws:iam::159553542841:role/awslabs-AOSH-GitHubActionsRole
+  BASE_PATH: /fsx/agents/pr-reviews
+  HOME_PATH: /home/ghactions
+
+permissions:
+  id-token: write
+  contents: read
+
 jobs:
   regression:
     strategy:
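
The env and permissions blocks added above are what allow the new Configure AWS Credentials step (further down in this diff) to work without stored access keys: id-token: write lets the job request a GitHub OIDC token, which aws-actions/configure-aws-credentials exchanges for temporary credentials on AWS_ROLE_ARN. A hedged sanity check, not part of this commit, that could run right after that step to confirm the role was assumed:

    # Prints the assumed-role ARN; fails fast if the OIDC federation is misconfigured.
    aws sts get-caller-identity --query 'Arn' --output text
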
@@ -21,86 +33,222 @@ jobs:
       matrix:
         cluster: [p5, p5-smhp]
         model_config: [llama2_7b, llama2_13b, llama2_70b, llama3_1_8b, llama3_1_70b]
-    runs-on: [self-hosted, "${{ matrix.cluster }}"]
+    runs-on: ubuntu-latest
     concurrency:
       group: ${{ github.workflow }}-${{ matrix.cluster }}-${{ matrix.model_config }}
       cancel-in-progress: false
-    timeout-minutes: 360 # 6 hours for the full Llama 2 test
+    timeout-minutes: 375
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
         with:
-          path: ${{ github.run_id }}
+          path: source-code
 
-      - name: Set env vars
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: ${{ env.AWS_ROLE_ARN }}
+          aws-region: ${{ env.AWS_REGION }}
+
+      - name: Setup SSH Key
         run: |
-          HOME_DIR="/home/github"
-          BUILD_ID="${{ github.run_id }}"
-          FSDP_DIR="$(pwd)/${BUILD_ID}/3.test_cases/pytorch/FSDP"
-          LOG_DIR="${HOME_DIR}/regression-logs-${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}"
-          CHECKPOINT_DIR="${HOME_DIR}/regression-checkpoints-${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}"
+          mkdir -p ~/.ssh
+          echo "${{ secrets.SLURM_SSH_KEY }}" > ~/.ssh/slurm_key
+          chmod 600 ~/.ssh/slurm_key
+
+          # Add host to known hosts with retry
+          for i in {1..5}; do
+            if ssh-keyscan -H ${{ env.SLURM_HOST }} >> ~/.ssh/known_hosts 2>/dev/null; then
+              echo "SSH keyscan successful"
+              break
+            fi
+            echo "SSH keyscan attempt $i failed, retrying..."
+            sleep 5
+          done
 
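
One observation about the Setup SSH Key step: every later ssh/scp call in this workflow also passes -o StrictHostKeyChecking=no, which accepts unknown hosts automatically, so the known_hosts entries gathered by the ssh-keyscan retry loop are largely redundant. A stricter variant (a sketch only; SLURM_HOST_KEY is a hypothetical secret holding the cluster's public host key, not something this commit defines) would pin the key and keep strict checking on:

    # Pin the cluster's host key from a repository secret instead of scanning it at run time.
    echo "${{ secrets.SLURM_HOST_KEY }}" >> ~/.ssh/known_hosts
    ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=yes \
      ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} "echo connection ok"
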
-          echo "HOME_DIR=$HOME_DIR" >> $GITHUB_ENV
-          echo "BUILD_ID=$BUILD_ID" >> $GITHUB_ENV
-          echo "FSDP_DIR=$FSDP_DIR" >> $GITHUB_ENV
+      - name: Setup Environment Variables
+        id: setup
+        run: |
+          BUILD_ID="${{ github.run_id }}"
+          REMOTE_TEST_PATH="${{ env.BASE_PATH }}/venv-tests/${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}"
+          LOG_DIR="${{ env.HOME_PATH }}/regression-logs-${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}"
+          CHECKPOINT_DIR="${{ env.BASE_PATH }}/checkpoints-${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}"
+
+          echo "remote_test_path=$REMOTE_TEST_PATH" >> $GITHUB_OUTPUT
+          echo "log_dir=$LOG_DIR" >> $GITHUB_OUTPUT
+          echo "checkpoint_dir=$CHECKPOINT_DIR" >> $GITHUB_OUTPUT
+
+          echo "REMOTE_TEST_PATH=$REMOTE_TEST_PATH" >> $GITHUB_ENV
           echo "LOG_DIR=$LOG_DIR" >> $GITHUB_ENV
           echo "CHECKPOINT_DIR=$CHECKPOINT_DIR" >> $GITHUB_ENV
-          echo "Env vars set successfully!"
 
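
The Setup Environment Variables step writes each path twice on purpose: values appended to $GITHUB_OUTPUT become step outputs addressable as ${{ steps.setup.outputs.* }} (the step carries id: setup), while values appended to $GITHUB_ENV surface as ${{ env.* }} in every later step of the same job. As an illustration only, both of the following lines would print the same directory from any subsequent step:

    echo "via step output: ${{ steps.setup.outputs.log_dir }}"
    echo "via job env:     ${{ env.LOG_DIR }}"
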
-      - name: Create directories
+      - name: Create Remote Directories
         run: |
-          mkdir -p ${{ env.LOG_DIR }} ${{ env.CHECKPOINT_DIR }}
-          chmod 755 ${{ env.LOG_DIR }} ${{ env.CHECKPOINT_DIR }}
-
-      - name: Create virtual environment
-        working-directory: ${{ env.FSDP_DIR }}/slurm
+          ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 \
+            ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} << EOF
+            mkdir -p ${{ env.REMOTE_TEST_PATH }}
+            mkdir -p ${{ env.LOG_DIR }}
+            mkdir -p ${{ env.CHECKPOINT_DIR }}
+            chmod 755 ${{ env.LOG_DIR }} ${{ env.CHECKPOINT_DIR }}
+          EOF
+
+      - name: Transfer Code to Cluster
         run: |
-          bash ./create_venv.sh
-          echo "Virtual environment created successfully!"
+          # Transfer code with retry
+          for i in {1..3}; do
+            if scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 -r \
+              source-code/* ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.REMOTE_TEST_PATH }}/; then
+              echo "Code transfer successful"
+              break
+            fi
+            echo "Transfer attempt $i failed, retrying..."
+            sleep 10
+          done
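
The Transfer Code to Cluster step retries a full scp -r on failure, which re-copies the whole checkout on every attempt. A possible alternative, sketched here but not part of this commit, is rsync over the same key; it skips files that already match on the remote side, so retries are cheaper:

    # Same source and destination as the scp call above, but resumable across retries.
    rsync -az -e "ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30" \
      source-code/ ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.REMOTE_TEST_PATH }}/
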
 
-      - name: Run regression test
-        id: run_test
-        working-directory: ${{ env.FSDP_DIR }}/slurm
+      - name: Create Virtual Environment on Cluster
+        run: |
+          FSDP_SLURM_DIR="${{ env.REMOTE_TEST_PATH }}/3.test_cases/pytorch/FSDP/slurm"
+
+          echo "Creating virtual environment on cluster..."
+          ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 \
+            ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} << EOF
+            set -e
+            cd $FSDP_SLURM_DIR
+            bash ./create_venv.sh
+            echo "Virtual environment created successfully!"
+          EOF
+
+      - name: Prepare and Submit Slurm Job
+        id: submit_job
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
-          source env/bin/activate
+          FSDP_SLURM_DIR="${{ env.REMOTE_TEST_PATH }}/3.test_cases/pytorch/FSDP/slurm"
           SBATCH_FILE="${{ matrix.model_config }}-training.sbatch"
-          TMP_SBATCH="regression_test_${{ matrix.model_config }}.sbatch"
-
-          if [ ! -f "$SBATCH_FILE" ]; then
-            echo "Error: sbatch file ${SBATCH_FILE} does not exist!"
-            exit 1
-          fi
+          TMP_SBATCH="regression_test_${{ matrix.model_config }}.sbatch"
+
+          # Prepare and submit job on cluster
+          ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 \
+            ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} << EOF
+            set -e
+            cd $FSDP_SLURM_DIR
+
+            if [ ! -f "$SBATCH_FILE" ]; then
+              echo "Error: sbatch file ${SBATCH_FILE} does not exist!"
+              exit 1
+            fi
+
+            cp "$SBATCH_FILE" "$TMP_SBATCH"
+
+            # Modify sbatch script
+            sed -i "s|#SBATCH --output=.*|#SBATCH --output=${{ env.LOG_DIR }}/regression_test_%j.out|" "$TMP_SBATCH"
+            sed -i "s|#SBATCH --error=.*|#SBATCH --error=${{ env.LOG_DIR }}/regression_test_%j.err|" "$TMP_SBATCH"
+            sed -i "s|--checkpoint_dir=./checkpoints|--checkpoint_dir=${{ env.CHECKPOINT_DIR }}|g" "$TMP_SBATCH"
+
+            # Activate venv in the sbatch script
+            sed -i '1a source env/bin/activate' "$TMP_SBATCH"
+
+            # Submit job
+            echo "Submitting Slurm job..."
+            JOB_ID=\$(sbatch --parsable \$TMP_SBATCH)
+            echo "JOB_ID=\$JOB_ID" >> ${{ env.REMOTE_TEST_PATH }}/job_info.txt
+            echo "Submitted job: \$JOB_ID"
+          EOF
+
+          # Get job ID
+          sleep 2
+          JOB_ID=$(ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
+            ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \
+            "cat ${{ env.REMOTE_TEST_PATH }}/job_info.txt | grep JOB_ID | cut -d= -f2")
+
+          echo "job_id=$JOB_ID" >> $GITHUB_OUTPUT
+          echo "JOB_ID=$JOB_ID" >> $GITHUB_ENV
+          echo "Submitted Slurm job: $JOB_ID"
 
-          cp "$SBATCH_FILE" "$TMP_SBATCH"
+      - name: Monitor Job with Real-time Logs
+        id: monitor_job
+        run: |
+          echo "Monitoring job ${{ env.JOB_ID }}..."
+          START_TIME=$(date +%s)
+          TIMEOUT=21600 # 6 hours
+
+          LOG_FILE="${{ env.LOG_DIR }}/regression_test_${{ env.JOB_ID }}.out"
+
+          while true; do
+            CURRENT_TIME=$(date +%s)
+            ELAPSED=$((CURRENT_TIME - START_TIME))
+
+            if [ $ELAPSED -gt $TIMEOUT ]; then
+              echo "Timeout reached after 6 hours"
+              exit 1
+            fi
+
+            # Check job status
+            JOB_STATUS=$(ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
+              ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \
+              "squeue -j ${{ env.JOB_ID }} -h -o %T 2>/dev/null || echo 'COMPLETED'")
+
+            if [ -z "$JOB_STATUS" ] || [ "$JOB_STATUS" == "COMPLETED" ]; then
+              echo "Job completed successfully"
+              break
+            elif [ "$JOB_STATUS" == "FAILED" ] || [ "$JOB_STATUS" == "CANCELLED" ] || [ "$JOB_STATUS" == "TIMEOUT" ]; then
+              echo "Job failed with status: $JOB_STATUS"
+              exit 1
+            fi
+
+            # Stream logs in real-time
+            ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
+              ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \
+              "tail -n 50 $LOG_FILE 2>/dev/null || echo 'Waiting for log file...'"
+
+            echo "--- Job status: $JOB_STATUS (elapsed: $((ELAPSED / 60)) min) ---"
+            sleep 30
+          done
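
One caveat about the monitoring loop above: once a job has left the queue, squeue -j prints nothing or errors out, and the || echo 'COMPLETED' fallback then treats any finished job as a success, including one that actually ended FAILED or TIMEOUT. A more robust final check (a sketch, assuming Slurm accounting is enabled on the cluster) would ask sacct for the terminal state after the loop exits:

    # Query the allocation's final state and exit code from Slurm accounting.
    FINAL_STATE=$(ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
      ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \
      "sacct -j ${{ env.JOB_ID }} -X -n -o State,ExitCode")
    echo "Final state: $FINAL_STATE"
    echo "$FINAL_STATE" | grep -q COMPLETED || exit 1
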
 
-          sed -i "s|#SBATCH --output=.*|#SBATCH --output=${{ env.LOG_DIR }}/regression_test_%j.out|" "$TMP_SBATCH"
-          sed -i "s|#SBATCH --error=.*|#SBATCH --error=${{ env.LOG_DIR }}/regression_test_%j.err|" "$TMP_SBATCH"
-          sed -i "s|--checkpoint_dir=./checkpoints|--checkpoint_dir=${{ env.CHECKPOINT_DIR }}|g" "$TMP_SBATCH"
+      - name: Retrieve Logs
+        if: always()
+        run: |
+          echo "Retrieving logs from cluster..."
+          mkdir -p ./logs
+
+          # Copy logs with retry
+          for i in {1..3}; do
+            if scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 -r \
+              ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.LOG_DIR }}/* ./logs/ 2>/dev/null; then
+              echo "Logs retrieved successfully"
+              break
+            fi
+            echo "Log retrieval attempt $i failed, retrying..."
+            sleep 10
+          done
 
-          echo "Submitting Slurm job..."
-          sbatch --wait ${TMP_SBATCH}
-          exit_code=$?
-          echo "exit_code=$exit_code" >> $GITHUB_OUTPUT
-          echo "Slurm job completed with exit code: $exit_code"
-          if [ $exit_code -ne 0 ]; then
-            echo "Slurm job failed with exit code: $exit_code"
-            exit $exit_code
-          fi
-
       - name: Upload logs as artifacts
         if: always()
         uses: actions/upload-artifact@v4
         with:
           name: regression-logs-${{ github.run_id }}-${{ matrix.model_config }}-${{ matrix.cluster }}
-          path: ${{ env.LOG_DIR }}
+          path: ./logs
           retention-days: 60
 
       - name: Cleanup
         if: always()
         run: |
-          echo "Cleaning up..."
-          rm -rf ${{ env.LOG_DIR }} ${{ env.CHECKPOINT_DIR }}
-          echo "Logs and checkpoints cleaned up successfully!"
-
+          echo "Cleaning up remote resources..."
+
+          # Cancel job if still running
+          if [ -n "${{ env.JOB_ID }}" ]; then
+            ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
+              ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \
+              "scancel ${{ env.JOB_ID }} 2>/dev/null || true"
+          fi
+
+          # Clean up directories
+          ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
+            ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} << EOF
+            rm -rf ${{ env.REMOTE_TEST_PATH }}
+            rm -rf ${{ env.LOG_DIR }}
+            rm -rf ${{ env.CHECKPOINT_DIR }}
+          EOF
+
+          rm -rf ./logs
+          echo "Cleanup completed!"
