# PR #98: Fix typo in val_batch_size and remove unused imports (#908)
name: FSDP Regression Test (container)
# TODO: Additional configuration options as inputs (egs. number of nodes (auto change num GPUs and EFA variables accordingly), add support for g* instances etc)

# Trigger on any change touching the FSDP test case, plus manual dispatch.
on:
  push:
    branches: ["main"]
    paths:
      - '3.test_cases/pytorch/FSDP/**'
  pull_request:
    paths:
      - '3.test_cases/pytorch/FSDP/**'
  workflow_dispatch:

# Shared connection/layout settings for the Slurm cluster used by both jobs.
env:
  AWS_REGION: us-east-1
  SLURM_HOST: p5en.smml.aiml.aws.dev
  SLURM_USER: ghactions
  AWS_ROLE_ARN: arn:aws:iam::159553542841:role/awslabs-AOSH-GitHubActionsRole
  BASE_PATH: /fsx/agents/pr-reviews
  HOME_PATH: /home/ghactions

# id-token: write is required for OIDC-based AWS credential federation.
permissions:
  id-token: write
  contents: read
jobs:
  # Build the FSDP Docker image on each cluster and convert it to an enroot
  # squashfs image that the run-tests job consumes.
  build:
    strategy:
      fail-fast: true
      max-parallel: 3
      matrix:
        cluster: [p5, p5-smhp]
    runs-on: ubuntu-latest
    concurrency:
      group: ${{ github.workflow }}-${{ matrix.cluster }}-build
      cancel-in-progress: false
    timeout-minutes: 75
    outputs:
      enroot_image_path: ${{ steps.build.outputs.enroot_image_path }}
      remote_build_path: ${{ steps.setup.outputs.remote_build_path }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          path: source-code

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ env.AWS_ROLE_ARN }}
          aws-region: ${{ env.AWS_REGION }}

      - name: Setup SSH Key
        id: setup
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SLURM_SSH_KEY }}" > ~/.ssh/slurm_key
          chmod 600 ~/.ssh/slurm_key
          # Add host to known hosts with retry
          for i in {1..5}; do
            if ssh-keyscan -H ${{ env.SLURM_HOST }} >> ~/.ssh/known_hosts 2>/dev/null; then
              echo "SSH keyscan successful"
              break
            fi
            echo "SSH keyscan attempt $i failed, retrying..."
            sleep 5
          done
          # Per-run, per-cluster scratch directory for the build sources.
          REMOTE_BUILD_PATH="${{ env.BASE_PATH }}/container-builds/${{ github.run_id }}-${{ matrix.cluster }}"
          echo "remote_build_path=$REMOTE_BUILD_PATH" >> $GITHUB_OUTPUT
          echo "REMOTE_BUILD_PATH=$REMOTE_BUILD_PATH" >> $GITHUB_ENV

      - name: Transfer Code to Cluster
        run: |
          # Create remote directory
          ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 \
            ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \
            "mkdir -p ${{ env.REMOTE_BUILD_PATH }}"
          # Transfer code with retry
          for i in {1..3}; do
            if scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 -r \
              source-code/* ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.REMOTE_BUILD_PATH }}/; then
              echo "Code transfer successful"
              break
            fi
            echo "Transfer attempt $i failed, retrying..."
            sleep 10
          done

      - name: Build container on cluster
        id: build
        working-directory: source-code
        run: |
          FSDP_DIR="${{ env.REMOTE_BUILD_PATH }}/3.test_cases/pytorch/FSDP"
          ENROOT_IMAGE="${{ env.BASE_PATH }}/enroot-images/fsdp-${{ github.run_id }}-${{ matrix.cluster }}.sqsh"
          echo "Building FSDP image on cluster..."
          # Unquoted heredoc: $FSDP_DIR / $ENROOT_IMAGE expand locally before
          # the script is shipped to the remote host.
          ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 \
            ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} << EOF
          set -e
          cd $FSDP_DIR
          echo "Building Docker image..."
          docker build -t fsdp:pytorch .
          echo "Converting to enroot image..."
          mkdir -p ${{ env.BASE_PATH }}/enroot-images
          enroot import -o $ENROOT_IMAGE dockerd://fsdp:pytorch
          echo "Enroot image created at: $ENROOT_IMAGE"
          # Clean up Docker image to save space
          docker rmi fsdp:pytorch || true
          EOF
          # Publish the image path for the run-tests job (runs on the runner,
          # after the remote heredoc above has completed).
          echo "enroot_image_path=$ENROOT_IMAGE" >> $GITHUB_OUTPUT
          echo "Container build completed successfully!"

      - name: Cleanup Build Directory
        if: always()
        run: |
          ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 \
            ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \
            "rm -rf ${{ env.REMOTE_BUILD_PATH }}" || true

  # Submit one Slurm training job per (cluster, model_config) pair using the
  # enroot image produced by the build job, stream its logs, and archive them.
  run-tests:
    needs: build
    strategy:
      fail-fast: false
      max-parallel: 6
      matrix:
        cluster: [p5, p5-smhp]
        model_config: [llama2_7b, llama2_13b, llama2_70b, llama3_1_8b, llama3_1_70b]
    runs-on: ubuntu-latest
    concurrency:
      group: ${{ github.workflow }}-${{ matrix.cluster }}-${{ matrix.model_config }}
      cancel-in-progress: false
    timeout-minutes: 375
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          path: source-code

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ env.AWS_ROLE_ARN }}
          aws-region: ${{ env.AWS_REGION }}

      - name: Setup SSH Key
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SLURM_SSH_KEY }}" > ~/.ssh/slurm_key
          chmod 600 ~/.ssh/slurm_key
          for i in {1..5}; do
            if ssh-keyscan -H ${{ env.SLURM_HOST }} >> ~/.ssh/known_hosts 2>/dev/null; then
              break
            fi
            echo "SSH keyscan attempt $i failed, retrying..."
            sleep 5
          done

      - name: Setup Environment Variables
        id: setup
        run: |
          # All remote paths are namespaced by run id, model config, and
          # cluster so parallel matrix legs never collide.
          BUILD_ID="${{ github.run_id }}"
          REMOTE_TEST_PATH="${{ env.BASE_PATH }}/container-tests/${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}"
          LOG_DIR="${{ env.HOME_PATH }}/regression-logs-${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}"
          CHECKPOINT_DIR="${{ env.BASE_PATH }}/checkpoints-${BUILD_ID}-${{ matrix.model_config }}-${{ matrix.cluster }}"
          ENROOT_IMAGE="${{ env.BASE_PATH }}/enroot-images/fsdp-${{ github.run_id }}-${{ matrix.cluster }}.sqsh"
          echo "remote_test_path=$REMOTE_TEST_PATH" >> $GITHUB_OUTPUT
          echo "log_dir=$LOG_DIR" >> $GITHUB_OUTPUT
          echo "checkpoint_dir=$CHECKPOINT_DIR" >> $GITHUB_OUTPUT
          echo "enroot_image=$ENROOT_IMAGE" >> $GITHUB_OUTPUT
          echo "REMOTE_TEST_PATH=$REMOTE_TEST_PATH" >> $GITHUB_ENV
          echo "LOG_DIR=$LOG_DIR" >> $GITHUB_ENV
          echo "CHECKPOINT_DIR=$CHECKPOINT_DIR" >> $GITHUB_ENV
          echo "ENROOT_IMAGE=$ENROOT_IMAGE" >> $GITHUB_ENV

      - name: Create Remote Directories
        run: |
          ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 \
            ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} << EOF
          mkdir -p ${{ env.REMOTE_TEST_PATH }}
          mkdir -p ${{ env.LOG_DIR }}
          mkdir -p ${{ env.CHECKPOINT_DIR }}
          chmod 755 ${{ env.LOG_DIR }} ${{ env.CHECKPOINT_DIR }}
          EOF

      - name: Transfer Code to Cluster
        run: |
          for i in {1..3}; do
            if scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 -r \
              source-code/* ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.REMOTE_TEST_PATH }}/; then
              echo "Code transfer successful"
              break
            fi
            echo "Transfer attempt $i failed, retrying..."
            sleep 10
          done

      - name: Prepare and Submit Slurm Job
        id: submit_job
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          FSDP_DIR="${{ env.REMOTE_TEST_PATH }}/3.test_cases/pytorch/FSDP"
          SBATCH_FILE="slurm/${{ matrix.model_config }}-training.sbatch"
          TMP_SBATCH="slurm/regression_test_${{ matrix.model_config }}_${{ matrix.cluster }}.sbatch"
          # Prepare job script on cluster. \$-escaped variables expand on the
          # remote host; unescaped ones expand locally before the SSH call.
          ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 \
            ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} << EOF
          set -e
          cd $FSDP_DIR
          if [ ! -f "$SBATCH_FILE" ]; then
            echo "Error: sbatch file ${SBATCH_FILE} does not exist!"
            exit 1
          fi
          cp "$SBATCH_FILE" "$TMP_SBATCH"
          # Modify sbatch script
          sed -i "s|#SBATCH --output=.*|#SBATCH --output=${{ env.LOG_DIR }}/regression_test_%j.out|" "$TMP_SBATCH"
          sed -i "s|#SBATCH --error=.*|#SBATCH --error=${{ env.LOG_DIR }}/regression_test_%j.err|" "$TMP_SBATCH"
          sed -i "s|#export CONTAINER_IMAGE=.*|export CONTAINER_IMAGE=${{ env.ENROOT_IMAGE }}|" "$TMP_SBATCH"
          sed -i "s|--checkpoint_dir=./checkpoints|--checkpoint_dir=/checkpoints|g" "$TMP_SBATCH"
          sed -i "s|--container-mounts.*|--container-mounts \\$FSX_MOUNT,${{ env.CHECKPOINT_DIR }}:/checkpoints|" "$TMP_SBATCH"
          # Submit job
          echo "Submitting Slurm job..."
          JOB_ID=\$(sbatch --parsable $TMP_SBATCH)
          echo "JOB_ID=\$JOB_ID" >> ${{ env.REMOTE_TEST_PATH }}/job_info.txt
          echo "Submitted job: \$JOB_ID"
          EOF
          # Get job ID
          sleep 2
          JOB_ID=$(ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
            ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \
            "cat ${{ env.REMOTE_TEST_PATH }}/job_info.txt | grep JOB_ID | cut -d= -f2")
          echo "job_id=$JOB_ID" >> $GITHUB_OUTPUT
          echo "JOB_ID=$JOB_ID" >> $GITHUB_ENV
          echo "Submitted Slurm job: $JOB_ID"

      - name: Monitor Job with Real-time Logs
        id: monitor_job
        run: |
          echo "Monitoring job ${{ env.JOB_ID }}..."
          START_TIME=$(date +%s)
          TIMEOUT=21600 # 6 hours
          # Get initial log file name (will be updated once job starts)
          LOG_FILE="${{ env.LOG_DIR }}/regression_test_${{ env.JOB_ID }}.out"
          while true; do
            CURRENT_TIME=$(date +%s)
            ELAPSED=$((CURRENT_TIME - START_TIME))
            if [ $ELAPSED -gt $TIMEOUT ]; then
              echo "Timeout reached after 6 hours"
              exit 1
            fi
            # Check job status.
            # NOTE(review): once a job leaves the queue, squeue exits non-zero
            # and this reports COMPLETED even if the job FAILED — consider
            # confirming the terminal state via sacct.
            JOB_STATUS=$(ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
              ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \
              "squeue -j ${{ env.JOB_ID }} -h -o %T 2>/dev/null || echo 'COMPLETED'")
            if [ -z "$JOB_STATUS" ] || [ "$JOB_STATUS" == "COMPLETED" ]; then
              echo "Job completed successfully"
              break
            elif [ "$JOB_STATUS" == "FAILED" ] || [ "$JOB_STATUS" == "CANCELLED" ] || [ "$JOB_STATUS" == "TIMEOUT" ]; then
              echo "Job failed with status: $JOB_STATUS"
              exit 1
            fi
            # Stream logs in real-time
            ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
              ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \
              "tail -n 50 $LOG_FILE 2>/dev/null || echo 'Waiting for log file...'"
            echo "--- Job status: $JOB_STATUS (elapsed: $((ELAPSED / 60)) min) ---"
            sleep 30
          done

      - name: Retrieve Logs
        if: always()
        run: |
          echo "Retrieving logs from cluster..."
          mkdir -p ./logs
          # Copy logs with retry
          for i in {1..3}; do
            if scp -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no -o ConnectTimeout=30 -r \
              ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }}:${{ env.LOG_DIR }}/* ./logs/ 2>/dev/null; then
              echo "Logs retrieved successfully"
              break
            fi
            echo "Log retrieval attempt $i failed, retrying..."
            sleep 10
          done

      - name: Upload logs as artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: regression-logs-${{ matrix.model_config }}-${{ matrix.cluster }}-${{ github.run_id }}
          path: ./logs
          retention-days: 60

      - name: Cleanup
        if: always()
        run: |
          echo "Cleaning up remote resources..."
          # Cancel job if still running
          if [ -n "${{ env.JOB_ID }}" ]; then
            ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
              ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} \
              "scancel ${{ env.JOB_ID }} 2>/dev/null || true"
          fi
          # Clean up directories
          ssh -i ~/.ssh/slurm_key -o StrictHostKeyChecking=no \
            ${{ env.SLURM_USER }}@${{ env.SLURM_HOST }} << EOF
          rm -rf ${{ env.REMOTE_TEST_PATH }}
          rm -rf ${{ env.LOG_DIR }}
          rm -rf ${{ env.CHECKPOINT_DIR }}
          # Clean up enroot image if this is the last test for this build
          if [ "${{ matrix.model_config }}" == "llama3_1_70b" ]; then
            rm -f ${{ env.ENROOT_IMAGE }} || true
          fi
          EOF
          rm -rf ./logs
          echo "Cleanup completed!"